################################################################################
## LOAD PACKAGES
################################################################################

library(tidyverse)        # A collection of R packages for data manipulation and visualization (includes ggplot2, dplyr, etc.)
library(dplyr)            # For data manipulation (e.g., filtering, grouping, summarizing)
library(forcats)          # For working with factors (e.g., reordering, modifying levels)
library(stringr)          # For string manipulation (e.g., pattern matching, cleaning)
library(janitor)          # For data cleaning (e.g., cleaning column names)
library(data.table)       # For fast and efficient data manipulation, especially with large datasets
library(lubridate)        # For date-time manipulation and conversion
library(ggtext)           # For enhanced text rendering in ggplot (e.g., HTML/Markdown support)
library(ggimage)          # For adding images into ggplot2 plots (e.g., logos or icons)
library(grid)             # For customizing and combining graphical objects
library(cowplot)          # For combining ggplot figures and extracting legends
library(scales)           # For custom scales in ggplot2 (e.g., axis formatting)
library(factoextra)       # For visualizing multivariate analysis (PCA, clustering, etc.)
library(corrplot)         # For visualizing correlation matrices
library(broom)            # For converting model outputs into tidy data frames
library(psych)            # For psychometric and multivariate analysis (e.g., factor analysis, reliability)
library(EFAtools)         # For exploratory factor analysis and related tools
library(EFA.dimensions)   # For dimension estimation in exploratory factor analysis
library(GPArotation)      # For factor rotation methods (e.g., Varimax, Promax)
library(lavaan)           # For structural equation modeling (SEM), including CFA
library(semPlot)          # For visualizing SEM models
library(semptools)        # For customizing SEM diagrams and enhancing semPlot visuals
#remotes::install_version("mirt", version = "1.43") # Previous version of mirt needed
library(mirt)             # For multidimensional item response theory (IRT) modeling
library(forecast)         # For time series analysis and forecasting models (e.g., ARIMA)
library(vegan)            # For ecological and multivariate data analysis (e.g., ordination, diversity indices)
library(readxl)           # For reading Excel files (.xlsx and .xls formats)
library(writexl)          # For writing Excel files (.xlsx format)
#library(xlsx)             # For reading and writing older Excel files (.xls/.xlsx)
library(xtable)           # For creating LaTeX or HTML tables from data frames
library(gtools)           # Miscellaneous R programming tools (e.g., mixedSort, permutations)

################################################################################
## SET WORKING DIRECTORY
################################################################################

getwd()

#setwd() # Set working directory here

################################################################################
## LOAD DATA: CITIZENS' OPINIONS
################################################################################

# Load data

stemtest.youth.nl.df <- read.csv("stemtest_youth_nl_df.csv")

################################################################################
## FIND OUTLIERS (DURATION)
################################################################################

# Remove sessions lasting 70 seconds or less
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% filter(duration > 70000)

# Check summary description of duration before removing outliers
stemtest.youth.nl.duration.w.outliers <- summary(stemtest.youth.nl.df$duration)

# Calculate Q1 (25th percentile) and Q3 (75th percentile) of duration
stemtest.youth.nl.q1 <- quantile(stemtest.youth.nl.df$duration, 0.25)
stemtest.youth.nl.q3 <- quantile(stemtest.youth.nl.df$duration, 0.75)

# Calculate the interquartile range (IQR)
irq.youth.nl <- stemtest.youth.nl.q3 - stemtest.youth.nl.q1

# Define lower and upper bounds for outliers (using 1.5 * IQR method)
lower.bound.youth.nl <- stemtest.youth.nl.q1 - 1.5 * irq.youth.nl
upper.bound.youth.nl <- stemtest.youth.nl.q3 + 1.5 * irq.youth.nl

# Identify outliers by checking if the duration is outside the defined bounds
stemtest.youth.nl.df$outlier <- stemtest.youth.nl.df$duration < lower.bound.youth.nl | stemtest.youth.nl.df$duration > upper.bound.youth.nl

# Show a summary of the outlier status (TRUE for outliers, FALSE for non-outliers)
table(stemtest.youth.nl.df$outlier)

# Drop rows that are outliers (outlier == FALSE)
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% filter(outlier == "FALSE") 

# Check summary description of duration after removing outliers
stemtest.youth.nl.duration.wo.outliers <- summary(stemtest.youth.nl.df$duration)

# Remove sessions lasting less than 180 seconds
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% filter(duration >= 180000)

################################################################################
## PERCENTAGE OF EACH ANSWER CATEGORY
################################################################################

# Get columns ending in "_opinion"
opinion.cols <- grep("_opinion$", names(stemtest.youth.nl.df), value = TRUE)

# Subset the dataframe to only _opinion columns
stemtest.youth.nl.opinion.df <- stemtest.youth.nl.df[, opinion.cols]

# Total number of "undecided", "agree", and "disagree"
stemtest.youth.nl.skipped <- sum(stemtest.youth.nl.opinion.df == "undecided", na.rm = TRUE)
stemtest.youth.nl.agree <- sum(stemtest.youth.nl.opinion.df == "agree", na.rm = TRUE)
stemtest.youth.nl.disagree <- sum(stemtest.youth.nl.opinion.df == "disagree", na.rm = TRUE)

# Total number of responses in _opinion columns
stemtest.youth.nl.total <- nrow(stemtest.youth.nl.opinion.df) * ncol(stemtest.youth.nl.opinion.df)

# Percentages
stemtest.youth.nl.pct.skipped <- (stemtest.youth.nl.skipped / stemtest.youth.nl.total) * 100
stemtest.youth.nl.pct.agree <- (stemtest.youth.nl.agree / stemtest.youth.nl.total) * 100
stemtest.youth.nl.pct.disagree <- (stemtest.youth.nl.disagree / stemtest.youth.nl.total) * 100

# Print
stemtest.youth.nl.pct.skipped
stemtest.youth.nl.pct.agree
stemtest.youth.nl.pct.disagree

# Average number of boosted statements per respondent
stemtest.youth.nl.avg.skipped <- mean(rowSums(stemtest.youth.nl.opinion.df == "undecided"))
stemtest.youth.nl.avg.skipped

################################################################################
## PERCENTAGE OF SKIPPED ITEMS: GRAPH
################################################################################

# Load labels to rename the columns and rows of the correlation matrix
labels.df <- read_excel("labels.xlsx", col_names = TRUE)

# Step 1: Calculate proportion of "undecided"
undecided.proportions <- stemtest.youth.nl.df %>%
  select(all_of(opinion.cols)) %>%
  summarise(across(everything(), ~mean(. == "undecided", na.rm = TRUE))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "undecided_proportion")

# Step 2: Extract item codes
undecided.proportions <- undecided.proportions %>%
  mutate(code = str_remove(variable, "_opinion"))

# Step 3: Join with labels.df
undecided.proportions <- undecided.proportions %>%
  left_join(labels.df, by = "code")

# Step 4: Format label as "COLORED new_column: rest of label"
undecided.proportions <- undecided.proportions %>%
  mutate(
    label_colored = paste0(
      "<span style='color:", colors, ";'><b>", toupper(abbr), "</b></span>. ", label
    )
  )

# Step 5: Create plot and save to file
png("figs/youth_nl_undecided_plot.png", units = "in", width = 10, height = 8, res = 300)

ggplot(undecided.proportions, aes(y = reorder(label_colored, -undecided_proportion), x = undecided_proportion)) +
  geom_col(fill = "grey40") +
  theme_minimal() +
  labs(
    y = NULL,                        # Remove y-axis title
    x = "Proportion of skipped items"     # Keep x-axis title
    # No plot title
  ) +
  theme(
    axis.text.y = element_markdown(size = 10.5),
    axis.text.x = element_text(size = 13),
    axis.title.x = element_text(margin = margin(t = 15, b = 5), size = 14),  # Larger margin above/below x axis title
    plot.title = element_blank()   # Remove plot title if any
  )

dev.off()

################################################################################
## PARTY ADVICE
################################################################################

# Set seed
set.seed(123)

# Clean the dataset by removing any rows with missing values and sample 100,000 rows randomly
stemtest.youth.nl.df.100k <- stemtest.youth.nl.df[complete.cases(stemtest.youth.nl.df), ][sample(100000), ]

# Create a new dataframe that excludes salience columns (columns containing "_boosted")
stemtest.youth.nl.df.opinion <- select(stemtest.youth.nl.df.100k, -contains("_boosted")) #remove salience columns

# Clean column names by removing "_opinion" from them for clarity
stemtest.youth.nl.df.opinion <- stemtest.youth.nl.df.opinion %>% 
  rename_with(~ str_remove(., "_opinion"), everything()) #clean column names (remove "_opinion" string)

# Add a new column to represent the variable as "Agreement"
stemtest.youth.nl.df.opinion$variable <- "Agreement" #create variable column

# Create a separate dataframe for the salience data (removing agreement columns)
stemtest.youth.nl.df.boosted <- select(stemtest.youth.nl.df.100k, -contains("_opinion")) #remove agreement columns

# Clean column names by removing "_boosted" to make them clearer
stemtest.youth.nl.df.boosted <- stemtest.youth.nl.df.boosted %>% 
  rename_with(~ str_remove(., "_boosted"), everything()) #clean column names (remove "_boosted" string)

# Add a new column to represent the variable as "Salience"
stemtest.youth.nl.df.boosted$variable <- "Salience" #create variable column

# Combine the two dataframes (agreement and salience) into one
stemtest.youth.nl.df.both <- rbind(stemtest.youth.nl.df.opinion, stemtest.youth.nl.df.boosted)

# Save the combined dataframe to a file for future use
saveRDS(stemtest.youth.nl.df.both, "files/stemtest_youth_nl_df_both.rds")

# Transform the dataframe to long format, gathering specific columns into two key variables: 'code' and 'value'
stemtest.youth.nl.df.long <- tidyr::pivot_longer(
  stemtest.youth.nl.df.both,
  cols = dplyr::starts_with("ST"),
  names_to = "code",
  values_to = "value",
  names_transform = list(code = as.factor)
)

# Select only necessary columns from the long format dataframe
stemtest.youth.nl.df.long <- stemtest.youth.nl.df.long %>%
  dplyr::select(id, code, variable, value)

# Define the column types for 'code', 'variable', and 'value'
stemtest.youth.nl.df.long$code <- as.factor(stemtest.youth.nl.df.long$code)
stemtest.youth.nl.df.long$variable <- as.factor(stemtest.youth.nl.df.long$variable)
stemtest.youth.nl.df.long$value <- as.factor(stemtest.youth.nl.df.long$value)

# Remove rows where the 'value' column is empty or contains an empty string
stemtest.youth.nl.df.long <- subset(stemtest.youth.nl.df.long, value != "")

# Load external data (merged dataframe) and rename the column 'SID' to 'code' to match with the other dataframe
youth.nl.merge <- readRDS("youth_nl_merge.rds")
youth.nl.merge <- rename(youth.nl.merge, c('code'='SID'))

# Merge the two dataframes (stemtest data and external merged data) based on the 'code' column
stemtest.youth.nl.df.long <- merge(stemtest.youth.nl.df.long, youth.nl.merge, by="code") #merge by statement code

# Convert columns from 'CDV_position' to 'VOORUIT_position' to numeric
stemtest.youth.nl.df.long <- stemtest.youth.nl.df.long %>% 
  mutate(across(CDV_position:VOORUIT_position, as.numeric))

# Create a new column 'agree' initialized as NA (missing values)
stemtest.youth.nl.df.long$agree <- NA

# Assign numeric values based on the 'value' column ('agree', 'disagree', 'undecided')
stemtest.youth.nl.df.long$agree[stemtest.youth.nl.df.long$value=="agree"] <- 1
stemtest.youth.nl.df.long$agree[stemtest.youth.nl.df.long$value=="disagree"] <- 2
stemtest.youth.nl.df.long$agree[stemtest.youth.nl.df.long$value=="undecided"] <- 0

# Assign 0 or 1 to the 'agree' column based on boolean values (TRUE/FALSE)
stemtest.youth.nl.df.long$agree[stemtest.youth.nl.df.long$value=="FALSE"] <- 0
stemtest.youth.nl.df.long$agree[stemtest.youth.nl.df.long$value=="TRUE"] <- 1

# Filter the data for salience rows (where variable is 'Salience')
stemtest.youth.nl.df.long.1 <- stemtest.youth.nl.df.long %>% 
  filter(variable == "Salience") 

# Filter the data for agreement rows (where variable is 'Agreement') and select only relevant columns
stemtest.youth.nl.df.long.2 <- stemtest.youth.nl.df.long %>% 
  filter(variable=="Agreement") %>% subset(select = c("id","code","agree"))

# Combine the two dataframes (salience and agreement) based on common 'id' and 'code' columns
stemtest.youth.nl.df.long <- stemtest.youth.nl.df.long.1 %>% 
  right_join(stemtest.youth.nl.df.long.2, by=c("id","code"))

# Rename columns to avoid name conflicts after joining
stemtest.youth.nl.df.long <- rename(stemtest.youth.nl.df.long, c('boosted'='agree.x',
                                                                     'agree'='agree.y'))

# Create a new dataframe with scores by summing the 'boosted' values for each 'id'
stemtest.scores.youth.nl <- stemtest.youth.nl.df.long %>% 
  group_by(id) %>% 
  mutate(sum_boosted = sum(boosted)) %>%
  ungroup()

# Calculate score for each party based on specific conditions
stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(CDV_score = case_when(agree==CDV_position & boosted==1 ~ stemtest.scores.youth.nl$CDV + 20/stemtest.scores.youth.nl$sum_boosted,
                               agree==CDV_position & boosted==0 ~ stemtest.scores.youth.nl$CDV + 0,
                               agree!=CDV_position ~ 0))

stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(GROEN_score = case_when(agree==GROEN_position & boosted==1 ~ stemtest.scores.youth.nl$GROEN + 20/stemtest.scores.youth.nl$sum_boosted,
                                 agree==GROEN_position & boosted==0 ~ stemtest.scores.youth.nl$GROEN + 0,
                                 agree!=GROEN_position ~ 0))

stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(NVA_score = case_when(agree==NVA_position & boosted==1 ~ stemtest.scores.youth.nl$NVA + 20/stemtest.scores.youth.nl$sum_boosted,
                               agree==NVA_position & boosted==0 ~ stemtest.scores.youth.nl$NVA + 0,
                               agree!=NVA_position ~ 0))

stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(VLD_score = case_when(agree==VLD_position & boosted==1 ~ stemtest.scores.youth.nl$VLD + 20/stemtest.scores.youth.nl$sum_boosted,
                               agree==VLD_position & boosted==0 ~ stemtest.scores.youth.nl$VLD + 0,
                               agree!=VLD_position ~ 0))

stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(PVDA_score = case_when(agree==PVDA_position & boosted==1 ~ stemtest.scores.youth.nl$PVDA + 20/stemtest.scores.youth.nl$sum_boosted,
                                agree==PVDA_position & boosted==0 ~ stemtest.scores.youth.nl$PVDA + 0,
                                agree!=PVDA_position ~ 0))

stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(VB_score = case_when(agree==VB_position & boosted==1 ~ stemtest.scores.youth.nl$VB + 20/stemtest.scores.youth.nl$sum_boosted,
                              agree==VB_position & boosted==0 ~ stemtest.scores.youth.nl$VB + 0,
                              agree!=VB_position ~ 0))

stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  mutate(VOORUIT_score = case_when(agree==VOORUIT_position & boosted==1 ~ stemtest.scores.youth.nl$VOORUIT + 20/stemtest.scores.youth.nl$sum_boosted,
                                   agree==VOORUIT_position & boosted==0 ~ stemtest.scores.youth.nl$VOORUIT + 0,
                                   agree!=VOORUIT_position ~ 0))

# Group by 'id' and sum the scores for each position
stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>%
  group_by(id) %>% 
  mutate(across(CDV_score:VOORUIT_score, sum)) %>%
  ungroup()

# Remove duplicate rows based on 'id' column while keeping the first occurrence
stemtest.scores.youth.nl <- distinct(stemtest.scores.youth.nl, id, .keep_all = TRUE)

# Rescale the scores by dividing by the sum of boosted responses and scaling to a percentage
stemtest.scores.youth.nl$CDV_rescaled <- (stemtest.scores.youth.nl$CDV_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100
stemtest.scores.youth.nl$GROEN_rescaled <- (stemtest.scores.youth.nl$GROEN_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100
stemtest.scores.youth.nl$NVA_rescaled <- (stemtest.scores.youth.nl$NVA_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100
stemtest.scores.youth.nl$VLD_rescaled <- (stemtest.scores.youth.nl$VLD_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100
stemtest.scores.youth.nl$PVDA_rescaled <- (stemtest.scores.youth.nl$PVDA_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100
stemtest.scores.youth.nl$VB_rescaled <- (stemtest.scores.youth.nl$VB_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100
stemtest.scores.youth.nl$VOORUIT_rescaled <- (stemtest.scores.youth.nl$VOORUIT_score/(100 + stemtest.scores.youth.nl$sum_boosted))*100

# Identify the rescaled columns
stemtest.rescaled.cols.youth.nl <- grep("_rescaled", colnames(stemtest.scores.youth.nl), value = TRUE)

# For each row, find the column with the maximum rescaled value and assign the political position to 'advice'
stemtest.advice.youth.nl <- stemtest.rescaled.cols.youth.nl[
  apply(stemtest.scores.youth.nl[, stemtest.rescaled.cols.youth.nl], 1, which.max)
]

# Assign the best advice (political position) to each row in the 'advice' column
stemtest.scores.youth.nl$advice <- stemtest.advice.youth.nl

# Clean the 'advice' column by removing the suffix (e.g., "CDV_rescaled" becomes "CDV")
stemtest.scores.youth.nl$advice <- sub("_.*", "", stemtest.scores.youth.nl$advice)

# Select only relevant columns
stemtest.scores.youth.nl <- stemtest.scores.youth.nl %>% subset(select = c("id","advice"))

# Merge party advice to wide dataframe
stemtest.youth.nl.df.opinion <- merge(stemtest.youth.nl.df.opinion, stemtest.scores.youth.nl, by="id") 

# Recode opinion values ("agree" = 1, "disagree" = 0, "undecided" = NA)
stemtest.youth.nl.df.opinion <- stemtest.youth.nl.df.opinion %>%
  mutate(across(starts_with("ST"), ~ recode(.x, "agree" = "1", "disagree" = "0", "undecided" = "")))

# Identify policy item columns (those starting with "ST")
stemtest.youth.nl.policy.cols <- names(stemtest.youth.nl.df.opinion)[grepl("^ST", names(stemtest.youth.nl.df.opinion))]

# Convert all ST columns to numeric (in case they are factors or characters)
stemtest.youth.nl.df.opinion[stemtest.youth.nl.policy.cols] <- lapply(
  stemtest.youth.nl.df.opinion[stemtest.youth.nl.policy.cols],
  function(x) as.numeric(as.character(x))
)

# Function to compute mean absolute tetrachoric correlation
avg_function <- function(data) {
  cor_matrix <- tetrachoric(data[, stemtest.youth.nl.policy.cols])$rho
  upper_vals <- cor_matrix[upper.tri(cor_matrix)]
  mean(abs(upper_vals), na.rm = TRUE)  # Take the absolute values before averaging
}

# Group by 'advice' (party) and compute mean absolute tetrachoric correlation
stemtest.youth.nl.avg.byparty <- stemtest.youth.nl.df.opinion %>%
  group_by(advice) %>%
  summarise(avg_abs_tetra = avg_function(cur_data_all()))

# Show the result
print(stemtest.youth.nl.avg.byparty)

################################################################################
## CREATE MAIN DATAFRAME FOR CITIZENS' OPINIONS
################################################################################

# Remove columns containing "_boosted" (such as salience-related columns)
stemtest.youth.nl.df <- select(stemtest.youth.nl.df, -contains("_boosted"))

# Clean column names by removing "_opinion" from all column names
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% 
  rename_with(~ str_remove(., "_opinion"), everything())

# Recode opinion values ("agree" = 1, "disagree" = 0, "undecided" = NA)
stemtest.youth.nl.df <- stemtest.youth.nl.df %>%
  mutate(across(starts_with("ST"), ~ recode(.x, "agree" = "1", "disagree" = "0", "undecided" = "")))

# Convert opinion columns to numeric format
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% 
  mutate(across(starts_with("ST"), as.numeric))

# Keep only the opinion columns, duration, and date
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% 
  select(starts_with('ST'), duration, date)

# Remove columns that have only NA values
stemtest.youth.nl.df <- stemtest.youth.nl.df[, colSums(is.na(stemtest.youth.nl.df)) != nrow(stemtest.youth.nl.df)]

# Keep only complete cases (rows with no missing values)
#stemtest.youth.nl.df <- stemtest.youth.nl.df[complete.cases(stemtest.youth.nl.df), ]

# Select only the opinion columns for further analysis
stemtest.youth.nl.df <- stemtest.youth.nl.df %>% 
  select(starts_with('ST'))

################################################################################
## COMPUTE CORRELATIONS BY ISSUE CATEGORY
################################################################################

youth.themes.nl.df <- stemtest.youth.nl.df
youth.themes.nl.df <- na.omit(youth.themes.nl.df)

# Load the Excel file containing the 'code' and 'abbr' columns
themes.data <- read_excel("labels.xlsx")  # Replace 'labels.xlsx' with your actual file name if needed

# Assuming the themes.data has columns 'code' (e.g., ST1, ST2, ...) and 'abbr' (e.g., ECONOMY, EDUCATION)
# Create a mapping of the code to the abbr (theme)
youth.nl.theme.map <- setNames(themes.data$abbr, themes.data$code)

# Assuming your R dataframe is called 'youth.themes.nl.df' and contains columns like ST1, ST2, ..., ST348
# Rename the columns in the dataframe based on the youth.nl.theme.map
colnames(youth.themes.nl.df) <- sapply(colnames(youth.themes.nl.df), function(colname) {
  # Check if the column name is in the youth.nl.theme.map and modify accordingly
  if (colname %in% names(youth.nl.theme.map)) {
    return(paste(youth.nl.theme.map[colname], colname, sep = "_"))
  } else {
    return(colname)  # In case there is no theme for that particular code
  }
})

# Step 1: Identify unique prefixes (everything before "_")
youth.nl.prefixes <- unique(substring(names(youth.themes.nl.df), 1, regexpr("_", names(youth.themes.nl.df)) - 1))

# Step 2: Compute tetrachoric correlations for columns with the same prefix
for (prefix in youth.nl.prefixes) {
  # Step 2a: Subset the dataframe to include only columns with the same prefix
  youth.nl.cols.with.prefix <- grep(paste0("^", prefix, "_"), names(youth.themes.nl.df), value = TRUE)
  
  # If there are fewer than 2 columns, skip the calculation for this prefix
  if (length(youth.nl.cols.with.prefix) < 2) {
    cat("\nSkipping prefix", prefix, "because it has fewer than 2 columns.\n")
    next  # Skip to the next prefix
  }
  
  # Subset the dataframe
  youth.themes.nl.df.subset <- youth.themes.nl.df[, youth.nl.cols.with.prefix]
  
  # Step 2b: Compute the tetrachoric correlation for the subset of columns
  youth.nl.tetra.cor <- tetrachoric(youth.themes.nl.df.subset)
  
  # Get the correlation matrix (rho)
  youth.nl.corr.matrix <- youth.nl.tetra.cor$rho
  
  # Step 3: Compute the absolute correlation matrix
  youth.nl.abs.corr.matrix <- abs(youth.nl.corr.matrix)  # Get the absolute values of the correlation matrix
  
  # Remove diagonal values (1s)
  diag(youth.nl.abs.corr.matrix) <- NA
  
  # Step 4: Compute the average absolute correlation for this set of columns
  youth.nl.avg.abs.corr.matrix <- mean(youth.nl.abs.corr.matrix, na.rm = TRUE)
  
  # Step 5: Adjust the average correlation by the number of variables (columns)
  N <- length(youth.nl.cols.with.prefix)  # Number of variables (columns)
  youth.nl.avg.adj.corr <- youth.nl.avg.abs.corr.matrix / (N - 1)  # Adjust by N - 1 (for pairwise correlation)
  
  # Step 6: Apply Fisher Z-transformation to the absolute correlation matrix
  youth.nl.abs.corr.matrix <- abs(youth.nl.corr.matrix)  # Take absolute values of correlations first
  youth.nl.fisher.z.matrix <- 0.5 * log((1 + youth.nl.abs.corr.matrix) / (1 - youth.nl.abs.corr.matrix))
  
  # Step 7: Compute the average Fisher Z-transformed correlation (excluding diagonal)
  youth.nl.fisher.z.matrix[lower.tri(youth.nl.fisher.z.matrix)] <- NA  # Optional: Set lower triangle to NA to avoid duplicates
  diag(youth.nl.fisher.z.matrix) <- NA  # Remove diagonal values
  youth.nl.avg.fisher.z <- mean(youth.nl.fisher.z.matrix, na.rm = TRUE)
  
  # Step 8: Reverse Fisher Z-transformation to get back to the original correlation scale
  youth.nl.avg.fisher.corr <- (exp(2 * youth.nl.avg.fisher.z) - 1) / (exp(2 * youth.nl.avg.fisher.z) + 1)
  
  # Print the results: Average, Adjusted Average, and Fisher Z-transformed Average correlation
  cat("\nFor prefix", prefix, ":\n")
  cat("  Average absolute correlation: ", youth.nl.avg.abs.corr.matrix, "\n")
  cat("  Adjusted average absolute correlation: ", youth.nl.avg.adj.corr, "\n")
  cat("  Fisher Z-transformed average correlation (absolute): ", youth.nl.avg.fisher.corr, "\n")
}

# Same but save in dataframe
# Initialize list to collect results
youth.nl.correlation.results <- list()

for (prefix in youth.nl.prefixes) {
  youth.nl.cols.with.prefix <- grep(paste0("^", prefix, "_"), names(youth.themes.nl.df), value = TRUE)
  
  if (length(youth.nl.cols.with.prefix) < 2) {
    cat("\nSkipping prefix", prefix, "because it has fewer than 2 columns.\n")
    next
  }
  
  youth.themes.nl.df.subset <- youth.themes.nl.df[, youth.nl.cols.with.prefix]
  youth.nl.tetra.cor <- tetrachoric(youth.themes.nl.df.subset)
  youth.nl.corr.matrix <- youth.nl.tetra.cor$rho
  
  youth.nl.abs.corr.matrix <- abs(youth.nl.corr.matrix)
  diag(youth.nl.abs.corr.matrix) <- NA
  youth.nl.avg.abs.corr.matrix <- mean(youth.nl.abs.corr.matrix, na.rm = TRUE)
  N <- length(youth.nl.cols.with.prefix)
  youth.nl.avg.adj.corr <- youth.nl.avg.abs.corr.matrix / (N - 1)
  
  youth.nl.fisher.z.matrix <- 0.5 * log((1 + youth.nl.abs.corr.matrix) / (1 - youth.nl.abs.corr.matrix))
  youth.nl.fisher.z.matrix[lower.tri(youth.nl.fisher.z.matrix)] <- NA
  diag(youth.nl.fisher.z.matrix) <- NA
  youth.nl.avg.fisher.z <- mean(youth.nl.fisher.z.matrix, na.rm = TRUE)
  youth.nl.avg.fisher.corr <- (exp(2 * youth.nl.avg.fisher.z) - 1) / (exp(2 * youth.nl.avg.fisher.z) + 1)
  
  # Append results
  youth.nl.correlation.results[[length(youth.nl.correlation.results) + 1]] <- list(
    theme = prefix,
    avg_abs_corr = youth.nl.avg.abs.corr.matrix,
    adj_avg_abs_corr = youth.nl.avg.adj.corr,
    fisher_z_corr = youth.nl.avg.fisher.corr
  )
}

# Convert list to dataframe
youth.nl.correlations.df <- do.call(rbind, lapply(youth.nl.correlation.results, as.data.frame))

# Save to Excel
write_xlsx(youth.nl.correlations.df, "files/youth_nl_correlations_df.xlsx")

################################################################################
## LOAD DATA: PARTY POSITIONS
################################################################################

# Load the party positions data from an Excel sheet
youth.nl.positions <- read_excel("positions.xlsx", sheet = "youth_nl")

# Reshape the data from wide to long format (for easier analysis)
youth.nl.positions.long <- youth.nl.positions %>%
  pivot_longer(
    cols = starts_with("party"),  # Specify the columns to pivot
    names_to = "party",           # New column for the former column names
    values_to = "value"           # New column for the values
  )

# Reshape data from long format to wide format using pivot_wider
youth.nl.positions.wide <- youth.nl.positions.long %>% 
  dplyr::select(-statement) %>%  # Remove the 'statement' column
  pivot_wider(
    names_from = SID,            # Use 'SID' to create new column names
    values_from = value          # Use 'value' to fill the new columns
  )

################################################################################
## KEEP COMMON COLUMNS
################################################################################

# Remove columns with constant values
stemtest.youth.nl.df <- stemtest.youth.nl.df[, sapply(stemtest.youth.nl.df, function(x) length(unique(x)) > 1)]
youth.nl.positions.wide <- youth.nl.positions.wide[, sapply(youth.nl.positions.wide, function(x) length(unique(x)) > 1)]

# Remove columns that contain one or more '99' values
youth.nl.positions.wide <- youth.nl.positions.wide[, !apply(youth.nl.positions.wide, 2, function(x) any(x == 99))]

# Find the common column names between the dataframes 'stemtest.youth.nl.df' and 'youth.nl.positions.wide'
youth.nl.common.columns <- intersect(names(stemtest.youth.nl.df), names(youth.nl.positions.wide))

# Subset the wide dataframe to keep only the common columns
youth.nl.positions.wide <- youth.nl.positions.wide[, youth.nl.common.columns]
stemtest.youth.nl.df.common <- stemtest.youth.nl.df[, youth.nl.common.columns]

################################################################################
## CREATE MAIN DATAFRAME FOR PARTY POSITIONS
################################################################################

# Recode values in the position columns: Replace '2' with '0'
youth.nl.positions.wide <- youth.nl.positions.wide %>%
  mutate(across(everything(), ~ replace(., . == 2, 0)))

# Convert all opinion columns (of type double) to numeric format
youth.nl.positions.wide <- youth.nl.positions.wide %>%
  mutate(across(where(is.double), as.numeric))

################################################################################
## CREATE CORRELATION MATRIX: CITIZENS
################################################################################

# Count the total number of sessions in the data
n.stemtest.youth.nl.df <- nrow(stemtest.youth.nl.df.common)

# Compute Kuder-Richardson Formula 20 (reliability coefficient)
#kr20_value <- validateR::kr20(stemtest.youth.nl.df.common)

# Compute tetrachoric correlation for the opinion data
youth.nl.tetrachoric.corr <- tetrachoric(stemtest.youth.nl.df.common)

# Extract the correlation matrix and convert it to absolute values
youth.nl.cor.matrix  <- abs(youth.nl.tetrachoric.corr$rho)

# Calculate the average absolute correlation for each row
youth.nl.avg.abs.corr.matrix <- rowMeans(youth.nl.cor.matrix )

# Convert the correlation matrix to long format (for easier visualization)
youth.nl.cor.matrix.long <- reshape2::melt(youth.nl.cor.matrix )
youth.nl.cor.matrix.long$Matrix <- "(a) Citizens"  # Label for the first matrix

# Perform hierarchical clustering on the correlation matrix
youth.nl.hc <- hclust(as.dist(1 - youth.nl.cor.matrix ), method = "complete")

# Get the order of items after hierarchical clustering
youth.nl.ordered.names <- rownames(youth.nl.cor.matrix )[youth.nl.hc$order]

# Reorder the correlation matrix according to the clustering order
youth.nl.cor.matrix.reordered  <- youth.nl.cor.matrix [youth.nl.hc$order, youth.nl.hc$order]

# Load labels to rename the columns and rows of the correlation matrix
labels.df <- read_excel("labels.xlsx", col_names = TRUE)

# Clean up the labels data (add new codes and labels)
labels.df <- labels.df %>%
  #mutate(new_column = paste0("T", as.integer(factor(category)))) %>%
  mutate(new_column = paste0(abbr)) %>%
  mutate(
    new_code = paste0(new_column, ". ", code),
    new_label = paste0(new_column, ". ", label)
  ) %>%
  arrange(category)

# Merge the labels with the correlation matrix data
youth.nl.cor.matrix.long <- merge(youth.nl.cor.matrix.long, labels.df, by.x = "Var1", by.y = "code")
youth.nl.cor.matrix.long <- merge(youth.nl.cor.matrix.long, labels.df, by.x = "Var2", by.y = "code")

# Label the rows and columns of the reordered correlation matrix
labels <- labels.df$label[match(colnames(youth.nl.cor.matrix.reordered ), labels.df$code)]
rownames(youth.nl.cor.matrix.reordered ) <- labels

# Perform hypothesis testing to compute p-values for the correlations
youth.nl.ptest <- cor.mtest(youth.nl.cor.matrix.reordered , conf.level = 0.95)

# Calculate the proportion of statistically significant correlations (p-value < 0.05)
youth.nl.mean.sig <- mean(youth.nl.ptest$p[upper.tri(youth.nl.ptest$p)] < 0.05)

# Set diagonal values of the correlation matrix to NA (self-correlations should be ignored)
youth.nl.cor.matrix.reordered.diag <- youth.nl.cor.matrix.reordered 
diag(youth.nl.cor.matrix.reordered.diag) <- NA

# Compute the average of the non-diagonal absolute correlations
youth.nl.mean.abs.corr <- mean(youth.nl.cor.matrix.reordered.diag, na.rm = TRUE)

# Flatten the matrix into a vector for easier analysis
youth.nl.cor.vector <- as.vector(youth.nl.cor.matrix.reordered.diag)

# Count the number of correlations with absolute value greater than or equal to 0.4 and 0.2
youth.nl.num.above.0.4 <- sum(abs(youth.nl.cor.vector) >= 0.4, na.rm = TRUE)
youth.nl.num.above.0.2 <- sum(abs(youth.nl.cor.vector) >= 0.2, na.rm = TRUE)

# Calculate the total number of correlations
youth.nl.num.total <- length(youth.nl.cor.vector)

# Calculate the percentage of correlations above the thresholds
youth.nl.percentage.above.0.4 <- (youth.nl.num.above.0.4 / youth.nl.num.total) * 100
youth.nl.percentage.above.0.2 <- (youth.nl.num.above.0.2 / youth.nl.num.total) * 100

################################################################################
## CREATE CORRELATION MATRIX: PARTIES
################################################################################

# Apply the Kuder-Richardson Formula 20 to validate the data
#pkr20_value <- validateR::kr20(youth.nl.positions.wide)

# Compute the tetrachoric correlation matrix
youth.nl.tetrachoric.corr.p <- tetrachoric(youth.nl.positions.wide)
youth.nl.cor.matrix.p  <- youth.nl.tetrachoric.corr.p$rho

# Take the absolute values of the correlation matrix
youth.nl.cor.matrix.p  <- abs(youth.nl.cor.matrix.p )

# Convert the correlation matrix to long format for easier manipulation
youth.nl.cor.matrix.p.long <- reshape2::melt(abs(youth.nl.cor.matrix.p ))
youth.nl.cor.matrix.p.long$Matrix <- "(b) Party leaders"  # Label the matrix for visualization

# Merge with 'labels.df' to get descriptive labels for the columns/rows
youth.nl.cor.matrix.p.long <- merge(youth.nl.cor.matrix.p.long, labels.df, by.x = "Var1", by.y = "code")
youth.nl.cor.matrix.p.long <- merge(youth.nl.cor.matrix.p.long, labels.df, by.x = "Var2", by.y = "code")

# Perform hierarchical clustering on the correlation matrix
youth.nl.hc.p <- hclust(as.dist(1 - youth.nl.cor.matrix.p ), method = "complete")

# Reorder the correlation matrix based on the hierarchical clustering order
youth.nl.cor.matrix.p.reordered  <- youth.nl.cor.matrix.p [youth.nl.ordered.names, youth.nl.ordered.names]

# Assign labels to the reordered matrix columns and rows based on 'labels.df'
labels <- labels.df$label[match(colnames(youth.nl.cor.matrix.p.reordered ), labels.df$code)]
rownames(youth.nl.cor.matrix.p.reordered ) <- labels

# Compute the confidence intervals for each pair of items in the correlation matrix
pitemsCols <- 1:ncol(youth.nl.cor.matrix.p.reordered )
youth.nl.ptest.p <- cor.mtest(youth.nl.cor.matrix.p.reordered [ , pitemsCols], conf.level = 0.95)

# Calculate the proportion of statistically significant correlations
youth.nl.mean.sig.p <- mean(youth.nl.ptest.p$p[upper.tri(youth.nl.ptest.p$p)] < 0.05)

# Set diagonal values of the matrix to NA (ignores self-correlations)
youth.nl.cor.matrix.p.reordered.diag <- youth.nl.cor.matrix.p.reordered 
diag(youth.nl.cor.matrix.p.reordered.diag) <- NA

# Compute the average of the non-diagonal absolute correlations
youth.nl.mean.abs.corr.p <- mean(youth.nl.cor.matrix.p.reordered.diag, na.rm = TRUE)

# Flatten the matrix into a vector for further analysis
youth.nl.cor.vector.p <- as.vector(youth.nl.cor.matrix.p.reordered.diag)

# Count the number of correlations with an absolute value ≥ 0.4
youth.nl.num.above.0.4.p <- sum(abs(youth.nl.cor.vector.p) >= 0.4, na.rm = TRUE)

# Count the number of correlations with an absolute value ≥ 0.2
youth.nl.num.above.0.2.p <- sum(abs(youth.nl.cor.vector.p) >= 0.2, na.rm = TRUE)

# Calculate the total number of correlations
youth.nl.num.total.p <- length(youth.nl.cor.vector.p)

# Calculate the percentage of correlations that are ≥ 0.4 and ≥ 0.2
youth.nl.percentage.above.0.4.p <- (youth.nl.num.above.0.4.p / youth.nl.num.total.p) * 100
youth.nl.percentage.above.0.2.p <- (youth.nl.num.above.0.2.p / youth.nl.num.total.p) * 100

################################################################################
## CORRELATION BETWEEN THE TWO MATRICES
################################################################################

# Extract correlation matrices from tetrachoric objects
youth.nl.tetrachoric.matrix <- youth.nl.tetrachoric.corr$rho
youth.nl.tetrachoric.matrix.p <- youth.nl.tetrachoric.corr.p$rho

# Helper to flatten lower triangle
flatten_corr <- function(corr_matrix) {
  corr_matrix[lower.tri(corr_matrix)]
}

# 1. SCATTERPLOT OF CORRELATIONS WITH CORRELATION VALUE
youth.nl.vec.citizens <- flatten_corr(youth.nl.tetrachoric.matrix)
youth.nl.vec.parties <- flatten_corr(youth.nl.tetrachoric.matrix.p)
youth.nl.vec <- data.frame(Citizen = youth.nl.vec.citizens, Politician = youth.nl.vec.parties)

# Compute Pearson correlation
youth.nl.corr.val <- cor(youth.nl.vec$Citizen, youth.nl.vec$Politician, use = "complete.obs")

# Format correlation value to show two decimal places
youth.nl.corr.val <- sprintf("%.2f", youth.nl.corr.val)

png("figs/youth_nl_scatter_plot.png", units = "in", width = 10, height = 8, res = 300)

# Create the plot with improved styling
youth.nl.scatter.plot <- ggplot(youth.nl.vec, aes(x = Citizen, y = Politician)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, by = 0.2)) +  # Custom x-axis
  annotate("text", x = min(youth.nl.vec$Citizen, na.rm = TRUE) - 0.28,  # Move to left
           y = max(youth.nl.vec$Politician, na.rm = TRUE) + 0.05,  # Move to top
           label = paste0("r = ", youth.nl.corr.val),
           hjust = 0, vjust = 1, size = 4.5) +
  labs(x = "Citizens' Tetrachoric Correlations",
       y = "Party Leaders' Tetrachoric Correlations") +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_blank(),  # remove title
    axis.title.x = element_text(margin = margin(t = 15), size = 16),
    axis.title.y = element_text(margin = margin(r = 15), size = 16),
    axis.text = element_text(size = 13)
  )

youth.nl.scatter.plot

dev.off()

# 2. DIFFERENCE HEATMAP (clipped to [-1, 1])
par(mfrow = c(1, 1))  # Reset layout
youth.nl.diff.corr <- youth.nl.tetrachoric.matrix - youth.nl.tetrachoric.matrix.p
youth.nl.diff.corr.clipped <- pmax(pmin(youth.nl.diff.corr, 1), -1)  # clip values to [-1, 1]

corrplot(youth.nl.diff.corr.clipped, method = "color", tl.cex = 0.7, mar = c(0, 0, 2, 0))
#title("Citizen - Politician Correlations", line = 1)

################################################################################
## CREATE FACETED PLOT
################################################################################

# Combine the correlation matrices (from citizens and parties) into a single long-format dataframe
youth.nl.long.combined <- rbind(youth.nl.cor.matrix.long, youth.nl.cor.matrix.p.long)

# Select and organize necessary columns for further analysis and visualization
youth.nl.long.combined <- youth.nl.long.combined %>%
  select(new_code.x, new_code.y, new_label.x, new_label.y, value, category.x, category.y, Matrix)

# Convert codes to factors for better handling of categorical data
youth.nl.long.combined$new_code.x <- as.factor(youth.nl.long.combined$new_code.x)
youth.nl.long.combined$new_code.y <- as.factor(youth.nl.long.combined$new_code.y)

# Calculate the average absolute correlation for each row (excluding diagonal)
youth.nl.long.combined <- youth.nl.long.combined %>%
  group_by(new_code.x, Matrix) %>%
  mutate(avg_corr = mean(abs(value[new_code.x != new_code.y]), na.rm = TRUE)) %>%
  ungroup()

# Prepare data for labeling the average correlation values at the end of each row
youth.nl.avg.corr.labels <- youth.nl.long.combined %>%
  distinct(new_code.x, avg_corr, Matrix) %>%
  mutate(x_position = max(as.numeric(youth.nl.long.combined$new_code.y)))

# Merge with 'labels.df' to get descriptive labels for the codes
youth.nl.avg.corr.labels <- youth.nl.avg.corr.labels %>%
  mutate(new_code.x = as.character(new_code.x)) %>% 
  left_join(labels.df, by = c("new_code.x" = "new_code")) %>%
  rename(new_label.x = new_label)

# Calculate the maximum x-position from the data for plot adjustments
youth.nl.max.x.position <- max(as.numeric(youth.nl.long.combined$new_code.y))

# Set an offset for the average correlation labels (to avoid overlap)
youth.nl.offset.x <- 1.25  # A smaller offset to avoid pushing labels too far out

# Prepare the labels for average correlation, adjusting the position to the right of the plot
youth.nl.avg.corr.labels <- youth.nl.avg.corr.labels %>%
  mutate(x_position = youth.nl.max.x.position + youth.nl.offset.x)

# Merge the average correlation data with labels to ensure proper matching
youth.nl.avg.corr.labels <- left_join(youth.nl.avg.corr.labels, labels.df, by = "code")
youth.nl.avg.corr.labels <- youth.nl.avg.corr.labels %>%
  rename(new_label.y = new_label)

# Reorder the labels according to the ordered names
youth.nl.ordered.names.df <- data.frame(code = youth.nl.ordered.names)
youth.nl.merged.data <- merge(youth.nl.ordered.names.df, labels.df, by = "code")
youth.nl.labeled.vector <- youth.nl.merged.data$new_label
youth.nl.ordered.names <- youth.nl.merged.data$new_code

# Update the factor levels of the columns based on ordered names
youth.nl.long.combined$new_code.x <- factor(youth.nl.long.combined$new_code.x, levels = youth.nl.ordered.names)
youth.nl.long.combined$new_label.y <- factor(youth.nl.long.combined$new_label.y, levels = youth.nl.labeled.vector)

# Filter and prepare data for the citizens matrix (Matrix == "(a) Citizens")
youth.nl.avg.corr.labels_citizens <- youth.nl.avg.corr.labels[youth.nl.avg.corr.labels$Matrix == "(a) Citizens", ]

# Order the rows in the citizens matrix by average correlation values
youth.nl.avg.corr.labels_citizens$new_code.x <- factor(youth.nl.avg.corr.labels_citizens$new_code.x, 
                                              levels = youth.nl.avg.corr.labels_citizens$new_code.x[order(youth.nl.avg.corr.labels_citizens$avg_corr)])
youth.nl.new.order <- levels(youth.nl.avg.corr.labels_citizens$new_code.x)

# Reorder the factors based on the new order for the plot
youth.nl.ordered.names.df <- data.frame(new_code = youth.nl.new.order)
youth.nl.merged.data <- left_join(youth.nl.ordered.names.df, labels.df, by = "new_code")
youth.nl.labeled.vector <- youth.nl.merged.data$new_label
youth.nl.ordered.names <- youth.nl.ordered.names.df$new_code

# Update the levels of the factor for plotting
youth.nl.long.combined$new_code.x <- factor(youth.nl.long.combined$new_code.x, levels = youth.nl.ordered.names)
youth.nl.long.combined$new_label.y <- factor(youth.nl.long.combined$new_label.y, levels = youth.nl.labeled.vector)

# Final plot creation using ggplot2
png("figs/youth_nl_combined_g.png", units="in", width=17, height=9, res=1200)

# Create the ggplot
ggplot(youth.nl.long.combined, aes(new_code.x, new_label.y, fill = value)) +  # Define the basic aesthetics (x and y variables, fill color)
  
  # Add geom_tile() for creating heatmap-like tiles
  geom_tile() +
  
  # Set the color scale with gradient from white to black, and adjust limits and breaks for the scale
  scale_fill_gradient(
    low = "white",  # Color for low values
    high = "black",  # Color for high values
    limits = c(0, max(abs(youth.nl.long.combined$value))),  # Set the limits of the color scale to cover all values
    breaks = seq(0, max(abs(youth.nl.long.combined$value)), by = 0.2)  # Specify breaks for color scale
  ) + 
  
  # Use a minimal theme for a clean background
  theme_minimal() +
  
  # Facet the plot into multiple panels based on the "Matrix" variable
  facet_wrap(~ Matrix) + 
  
  # Rotate the x-axis labels for better visibility (90-degree rotation)
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  
  # Add the plot title and caption with dynamic text
  labs(title = "Comparison of Two Correlation Matrices",
       x = "Variables", y = "Variables",
       caption = paste("Number of online sessions:", comma(n.stemtest.youth.nl.df),
                       "\nMean absolute correlation coefficient: (a)", sprintf("%.2f", youth.nl.mean.abs.corr), 
                       "(b)", sprintf("%.2f", youth.nl.mean.abs.corr.p),
                       "\nPercentage of correlations ≥ 0.2: (a)", sprintf("%.2f%%", youth.nl.percentage.above.0.2), 
                       "(b)", sprintf("%.2f%%", youth.nl.percentage.above.0.2.p),
                       "\nPercentage of correlations ≥ 0.4: (a)", sprintf("%.2f%%", youth.nl.percentage.above.0.4), 
                       "(b)", sprintf("%.2f%%", youth.nl.percentage.above.0.4.p), 
                       "\nCorrelation of correlations:", youth.nl.corr.val)) +
  
  # Customize text sizes, remove titles, and format the plot to improve readability
  theme(
    axis.text.x = element_text(size = 10.5, angle = 90, vjust = 0.5),  # Rotate x-axis labels and adjust size
    axis.text.y = element_text(size = 10.5),  # Adjust size of y-axis labels
    axis.title.x = element_blank(),  # Remove x-axis title
    axis.title.y = element_blank(),  # Remove y-axis title
    legend.title = element_blank(),  # Remove legend title
    plot.title = element_blank(),    # Remove plot title
    panel.grid.major = element_blank(),  # Remove major grid lines
    panel.grid.minor = element_blank(),  # Remove minor grid lines
    plot.caption = element_text(size = 12),  # Adjust caption size
    strip.text = element_text(size = 16),  # Adjust size of facet labels
    panel.spacing.x = unit(.3, "cm"),  # Set space between panels
    legend.position = "bottom",  # Position the legend at the bottom
    legend.direction = "horizontal",  # Make the legend horizontal
    legend.key.height = unit(0.4, "cm"),  # Adjust the height of the legend keys
    legend.key.width = unit(2, "cm"),  # Adjust the width of the legend keys
    legend.text = element_text(size = 11),  # Adjust legend text size
    plot.margin = margin(0, 0, 0, 0, "cm")  # Set plot margins to zero for a tighter fit
  ) +
  
  # Customize x-axis labels using ggtext for HTML formatting to add color
  scale_x_discrete(labels = function(x) {
    x <- gsub("^ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", x, perl = TRUE)  # Add color for ECFL
    x <- gsub("^EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", x, perl = TRUE)  # Add color for EDUC
    x <- gsub("^ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", x, perl = TRUE)  # Add color for ENEG
    x <- gsub("^ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", x, perl = TRUE)  # Add color for ETHE
    x <- gsub("^FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", x, perl = TRUE)  # Add color for FAED
    x <- gsub("^WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", x, perl = TRUE)  # Add color for HSWF
    x <- gsub("^IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", x, perl = TRUE)  # Add color for IMIN
    x <- gsub("^CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", x, perl = TRUE)  # Add color for JLEF
    x <- gsub("^CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", x, perl = TRUE)  # Add color for MECU
    x <- gsub("^MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", x, perl = TRUE)  # Add color for MPTR
    x <- gsub("^HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", x, perl = TRUE)  # Add color for SPHO
    x <- gsub("^STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", x, perl = TRUE)  # Add color for SRPI
    return(x)
  }) +
  
  # Similarly, customize the y-axis labels with color formatting
  scale_y_discrete(labels = function(y) {
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  
  # Enable Markdown formatting for axis labels (to support HTML formatting)
  theme(axis.text.x = element_markdown(),
        axis.text.y = element_markdown()) +  
  
  # Add average absolute correlation labels at the end of each row, only for certain matrices
  geom_text(data = youth.nl.avg.corr.labels %>%
              filter(Matrix == "(a) Citizens" | Matrix == "(b) Party leaders"),  # Filter data for the labels
            aes(x = x_position, y = new_label.y, label = sprintf("%.2f", avg_corr)),
            inherit.aes = FALSE, size = 4, hjust = 0, vjust = 0.5) +  # Adjust label positioning
  
  # Expand x-axis to accommodate labels just outside the plot area
  coord_cartesian(xlim = c(NA, youth.nl.max.x.position + youth.nl.offset.x + 2))  # Set x-axis limit

# Save the plot to a file
dev.off()

################################################################################
## MEAN ABSOLUTE CORRELATION BY PARTY SUGGESTION
################################################################################

# Custom color palette for political party labels
party_palette <- c("NVA" = "#FCBD1B", "VB" = "black", "CDV" = "#FF6000", 
                   "PVDA" = "#EF4135", "VLD" = "#005DAA",
                   "VOORUIT" = "#FF2900", "GROEN" = "#02AD5B", "PS" = "#FF0000", 
                   "MR" = "#002EFF", "Ecolo" = "#088C39", "LesEngages" = "#00E6D2", 
                   "Defi" = "#DD007A")

# Define party logo image labels using HTML <img> tags
party.img.labels <- c(
  "CDV" = "<img src='logos/CDV.png' width='40' />",      # Christian Democratic Party (CDV) logo
  "Defi" = "<img src='logos/Defi.png' width='40' />",     # Defi party logo
  "Ecolo" = "<img src='logos/Ecolo.png' width='40' />",   # Ecolo party logo
  "GROEN" = "<img src='logos/GROEN.png' width='40' />",   # Groen party logo
  "LesEngages" = "<img src='logos/LesEngages.png' width='43' />", # Les Engagés party logo
  "MR" = "<img src='logos/MR.png' width='38' />",         # Mouvement Réformateur (MR) logo
  "NVA" = "<img src='logos/NVA.png' width='48' />",       # New Flemish Alliance (NVA) logo
  "VLD" = "<img src='logos/VLD.png' width='48' />",       # Open Flemish Liberals and Democrats (VLD) logo
  "PS" = "<img src='logos/PS.png' width='30' />",         # Socialist Party (PS) logo
  "PVDA" = "<img src='logos/PVDA.png' width='48' />",     # Workers' Party of Belgium (PVDA) logo
  "PTB" = "<img src='logos/PTB.png' width='48' />",           # Workers' Party of Belgium (PTB) logo
  "VB" = "<img src='logos/VB.png' width='48' />",         # Vlaams Belang (VB) logo
  "VOORUIT" = "<img src='logos/VOORUIT.png' width='53' />") # Vooruit party logo

# Add a new column 'image' to the data frame, containing the party logo image paths based on the party 'advice' column
stemtest.youth.nl.avg.byparty$image <- paste0("logos/", stemtest.youth.nl.avg.byparty$advice, ".png")

# Save the plot as a high-resolution PNG image
png("figs/stemtest_youth_nl_byparty.png", units="in", width=6, height=5, res=1200)

# Create a horizontal bar plot using ggplot2
ggplot(stemtest.youth.nl.avg.byparty, aes(x = avg_abs_tetra, y = reorder(advice, avg_abs_tetra), fill = advice)) +
  geom_col() +  # Create bars for each party with the average correlation value
  geom_vline(xintercept = youth.nl.mean.abs.corr, linetype = "dashed", color = "black", linewidth = 0.4) +  # Add a dashed vertical line for the mean correlation
  # Replace y-axis labels with party logos (using the 'party.img.labels' vector defined earlier)
  scale_y_discrete(labels = party.img.labels) + 
  # Apply custom color palette to fill bars based on party
  scale_fill_manual(values = party_palette) + 
  # Remove the legend as it's unnecessary for this plot
  guides(fill = "none") + 
  # Add numeric labels to each bar, formatted to two decimal places
  geom_text(aes(label = sprintf("%.2f", avg_abs_tetra)),  # Format labels to 2 decimal places
            position = position_stack(vjust = .92),  # Position labels within the bars
            size = 3, color = "white") +  # Set label text size and color
  theme_minimal() +  # Use a minimal theme for a clean look
  theme(
    axis.text.y = element_markdown(lineheight = 1.2),  # Render HTML content (party logos) on the y-axis
    axis.text.x = element_text(size = 10),  # Set smaller font size for x-axis labels
    axis.ticks.y = element_blank(),  # Remove the y-axis ticks
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),  # Reduce font size of x-axis title
    plot.margin = margin(0, 1, 0, 3)  # Increase space on the left for party logos to fit
  ) +
  labs(
    x = "Mean absolute correlation",  # Set x-axis title
    y = NULL,  # No y-axis title
    title = NULL  # Remove default plot title
  )

# Close the PNG device to save the plot
dev.off()

################################################################################
## CONFIRMATORY FACTOR ANALYSIS
################################################################################

# Ensure that `labels_df$code` contains only the columns that exist in `stemtest.youth.nl.df`
# This step filters out any codes in `labels_df` that do not have corresponding columns in the data frame
labels_df <- labels.df %>% filter(code %in% colnames(stemtest.youth.nl.df))

# Create a named vector `rename_map` that maps each old column name to a new name
# The new names combine the old code and corresponding topic (e.g., "code_topic")
rename_map <- setNames(paste0(labels_df$code, "_", labels_df$topic), labels_df$code)

# Rename the columns of `stemtest.youth.nl.df` according to `rename_map`
# This ensures that column names are more informative by combining the code and the topic
stemtest.youth.nl.df.cfa <- stemtest.youth.nl.df %>%
  rename_with(~ rename_map[.x], .cols = all_of(labels_df$code))

# Remove NA values
stemtest.youth.nl.df.cfa <- na.omit(stemtest.youth.nl.df.cfa)

## SPECIFY MODELS
################################################################################

# (A) One-factor model: All items load onto a single "ideology" factor

# Extract the column names from the data frame (excluding "_Other" items)
stemtest.youth.nl.items <- colnames(stemtest.youth.nl.df.cfa)
stemtest.youth.nl.items <- stemtest.youth.nl.items[!grepl("_Other", stemtest.youth.nl.items)]

# Define the CFA model for the one-factor model: all variables are indicators of a single latent factor "ideology"
stemtest.youth.nl.cfa.model.1f <- paste0("ideology =~ ", paste(stemtest.youth.nl.items, collapse = " + "))

# (B) Two-factor model: Items are split into two factors: Economic vs Socio-cultural

# Identify variables related to economic and socio-cultural factors based on their suffix
stemtest.youth.nl.economic.vars <- grep("_Economic$", names(stemtest.youth.nl.df.cfa), value = TRUE)
stemtest.youth.nl.cultural.vars <- grep("_Cultural$", names(stemtest.youth.nl.df.cfa), value = TRUE)

# Remove the suffix "_Economic" from the economic variable names for better readability
stemtest.youth.nl.economic.vars <- gsub("_Economic$", "", unlist(stemtest.youth.nl.economic.vars))

# Remove the suffix "_Cultural" from the cultural variable names for better readability
stemtest.youth.nl.cultural.vars <- gsub("_Cultural$", "", unlist(stemtest.youth.nl.cultural.vars))

# Define the CFA model for the two-factor model: 
# Economic items load on the "Economic" factor and Cultural items load on the "Cultural" factor
stemtest.youth.nl.cfa.model.2f <- paste(
  "Economic =~", paste(stemtest.youth.nl.economic.vars, collapse = " + "), "\n",
  "Cultural =~", paste(stemtest.youth.nl.cultural.vars, collapse = " + ")
)

## RUN CFA
################################################################################

# Fit the one-factor model using the WLSMV estimator (appropriate for binary data)
stemtest.youth.nl.cfa.fit.1f <- cfa(stemtest.youth.nl.cfa.model.1f, 
                                      data = stemtest.youth.nl.df.cfa, 
                                      ordered = colnames(stemtest.youth.nl.df.cfa), 
                                      estimator = "WLSMV")

# Clean up the column names of the data by removing everything after the first "_"
colnames(stemtest.youth.nl.df.cfa) <- sapply(strsplit(colnames(stemtest.youth.nl.df.cfa), "_"), `[`, 1)

# Fit the two-factor model (no estimator specified here as it's already default for binary data)
stemtest.youth.nl.cfa.fit.2f <- cfa(stemtest.youth.nl.cfa.model.2f, 
                                      data = stemtest.youth.nl.df.cfa, 
                                      ordered = colnames(stemtest.youth.nl.df.cfa))

## DIAGNOSTICS AND MODEL COMPARISON
################################################################################

# Inspect standardized factor loadings to check if they are sufficiently strong (typically > 0.5)
parameterEstimates(stemtest.youth.nl.cfa.fit.1f, standardized = TRUE) %>%
  filter(op == "=~") # Extract factor loadings for the one-factor model
parameterEstimates(stemtest.youth.nl.cfa.fit.2f, standardized = TRUE) %>%
  filter(op == "=~") # Extract factor loadings for the two-factor model

# Check overall model fit indices for both models (values close to 1 indicate good fit)
summary(stemtest.youth.nl.cfa.fit.1f, fit.measures = TRUE, standardized = TRUE)
summary(stemtest.youth.nl.cfa.fit.2f, fit.measures = TRUE, standardized = TRUE)

# Extract fit statistics
stemtest.youth.nl.cfa.fit.stats.f1 <- fitMeasures(stemtest.youth.nl.cfa.fit.1f, c("chisq", "df", "cfi", "tli", "rmsea", "srmr"))
stemtest.youth.nl.cfa.fit.stats.f2 <- fitMeasures(stemtest.youth.nl.cfa.fit.2f, c("chisq", "df", "cfi", "tli", "rmsea", "srmr"))

# Create a data frame with selected fit statistics
youth.nl.fit.table <- data.frame(
  Model = c("One-Factor", "Two-Factor"),
  RMSEA = c(stemtest.youth.nl.cfa.fit.stats.f1["rmsea"], stemtest.youth.nl.cfa.fit.stats.f2["rmsea"]),
  CFI   = c(stemtest.youth.nl.cfa.fit.stats.f1["cfi"], stemtest.youth.nl.cfa.fit.stats.f2["cfi"]),
  TLI   = c(stemtest.youth.nl.cfa.fit.stats.f1["tli"], stemtest.youth.nl.cfa.fit.stats.f2["tli"]),
  SRMR  = c(stemtest.youth.nl.cfa.fit.stats.f1["srmr"], stemtest.youth.nl.cfa.fit.stats.f2["srmr"])
)

# Round values for readability
youth.nl.fit.table[, 2:5] <- round(youth.nl.fit.table [, 2:5], 3)

# Run chi-square difference test
youth.nl.lrt.result <- lavTestLRT(stemtest.youth.nl.cfa.fit.1f, stemtest.youth.nl.cfa.fit.2f)
youth.nl.lrt.table <- as.data.frame(youth.nl.lrt.result)

# Generate LaTeX table for fit indices
youth.nl.fit.xtable <- xtable(youth.nl.fit.table, caption = "Fit Statistics for One-Factor and Two-Factor Models")
print(youth.nl.fit.xtable, include.rownames = FALSE, caption.placement = "top", booktabs = TRUE)

# Generate LaTeX table for chi-square difference test
youth.nl.lrt.xtable <- xtable(youth.nl.lrt.table, caption = "Chi-Square Difference Test Between Models")
print(youth.nl.lrt.xtable, include.rownames = FALSE, caption.placement = "top", booktabs = TRUE)

## INTERPRETATION
################################################################################

# Retrieve standardized factor loadings for the two-factor model
inspect(stemtest.youth.nl.cfa.fit.2f, what = "std")$lambda

# Check the internal consistency (reliability) of the economic and cultural factors using Cronbach's alpha
psych::alpha(stemtest.youth.nl.df.cfa[, stemtest.youth.nl.economic.vars])  # Reliability for Economic Factor
psych::alpha(stemtest.youth.nl.df.cfa[, stemtest.youth.nl.cultural.vars])  # Reliability for Cultural Factor

# Save the SEM plot of the two-factor model to a png file
png("figs/youth_nl_cfa_sem.png", units="in", width=25, height=15, res=300)

# Plot the two-factor model with standardized factor loadings displayed on the paths
semPaths(
  stemtest.youth.nl.cfa.fit.2f,
  what = "path", 
  whatLabels = "std",        # Display standardized estimates on the paths
  style = "ram",             # Use RAM-style diagram for better clarity
  layout = "spring",         # Force-directed layout for better separation of nodes
  fixedStyle = "red",        # Color fixed paths in red
  freeStyle = "black",       # Color free paths in black
  edge.width = 0.4,          # Set edge width for better visibility
  border.width = 0.2,        # Set border width of nodes
  border.color = "black",    # Set border color of nodes
  asize = 1,                 # Set arrow size for paths
  edge.label.bg = TRUE,      # Add background to edge labels for readability
  edge.label.cex = 0.5,      # Increase edge label size for readability
  edge.label.position = 0.7, # Position edge labels at the center of edges
  sizeMan = 4,               # Set the size of observed variable boxes
  sizeLat = 5,               # Set the size of latent variable ovals for emphasis
  color = list(lat = "lightgrey", man = "transparent"),  # Color scheme for variables
  title = FALSE,             # Remove default title from the plot
  intercepts = FALSE,        # Exclude intercepts from the plot
  thresholds = FALSE         # Exclude thresholds from the plot
)

# Add model fit statistics as text annotations on the plot
x_pos <- 0.90
y_pos <- 0.95

text(x = x_pos, y = y_pos, labels = paste("RMSEA:", round(stemtest.youth.nl.cfa.rmsea, 3), 
                                          "\nCFI:", round(stemtest.youth.nl.cfa.cfi, 3), 
                                          "\nTLI:", round(stemtest.youth.nl.cfa.tli, 3)),
     cex = 1.5, col = "black")

dev.off()

# Extract factor loadings from the two-factor model and clean up the 'rhs' variable for clarity
stemtest.youth.nl.cfa.f2.loadings <- parameterEstimates(stemtest.youth.nl.cfa.fit.2f, standardized = TRUE) %>%
  filter(op == "=~")

# Modify the 'rhs' column to keep only the part before the "_"
stemtest.youth.nl.cfa.f2.loadings$rhs <- sub("_.*", "", stemtest.youth.nl.cfa.f2.loadings$rhs)

# Merge the factor loadings with the corresponding labels from `labels_df`
stemtest.youth.nl.cfa.f2.loadings <- merge(stemtest.youth.nl.cfa.f2.loadings, labels_df, by.x = "rhs", by.y = "code")

# Reorder the 'label' variable within each latent factor based on the absolute value of the factor loading
stemtest.youth.nl.cfa.f2.loadings <- stemtest.youth.nl.cfa.f2.loadings %>%
  group_by(lhs) %>%
  mutate(label = factor(label, levels = label[order(abs(est))])) %>%
  ungroup()

# Save the factor loadings plot as a PNG file
png("figs/youth_nl_cfa_f2_loadings.png", units="in", width=7, height=6, res=1200)

# Create a bar plot of the factor loadings for each variable within each factor
ggplot(stemtest.youth.nl.cfa.f2.loadings, aes(x = est, y = label, fill = lhs)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#ff7f0e", "#1f77b4")) +  # Customize bar colors
  #scale_x_continuous(limits = c(-4, 4), breaks = seq(-4, 4, by = 2)) +  # Set x-axis range
  labs(
    x = "Loading strength",  # x-axis label for the plot
    y = NULL,  # Remove y-axis label
    fill = "Latent variable"  # Legend title
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 10, margin = margin(t = 8)),  # Customize x-axis title appearance
    legend.title = element_text(size = 9),
    legend.position = "bottom",  # Place legend at the bottom
    legend.key.width = unit(0.5, "cm"),  # Adjust legend key width
    legend.key.height = unit(0.5, "cm")  # Adjust legend key height
  )

dev.off()

################################################################################
## EXPLORATORY FACTOR ANALYSIS
################################################################################

youth.nl.tetrachoric.matrix <- tetrachoric(stemtest.youth.nl.df)$rho
#youth.nl.tetrachoric.matrix <- hetcor(stemtest.youth.nl.df)$cor

# Kaiser-Meyer-Olkin criterion: tests the suitability of the data for factor analysis
KMO(youth.nl.tetrachoric.matrix)

# Bartlett's test of sphericity: tests whether the correlation matrix is an identity matrix, indicating if factor analysis is appropriate
BARTLETT(stemtest.youth.nl.df)

# Eigenvalue method (“Kaiser’s rule”): calculates eigenvalues to determine the number of factors to retain
youth.nl.ev <- eigen(youth.nl.tetrachoric.matrix) # get eigenvalues
#youth.nl.ev <- eigen(cor(stemtest.youth.nl.df, use = 'pairwise.complete.obs')) # get eigenvalues
youth.nl.ev$values  # returns the eigenvalues for each factor

# Scree plot: visualizes the eigenvalues to help determine the number of factors to retain
scree(youth.nl.tetrachoric.matrix, pc=FALSE)  # Use pc=FALSE for factor analysis, without Principal Components

# Parallel analysis scree plot: compares observed eigenvalues with simulated eigenvalues to decide on the number of factors
set.seed(123)  # Set random seed for reproducibility
youth.nl.parallel <- fa.parallel(youth.nl.tetrachoric.matrix, fa='fa')  # Parallel analysis
youth.nl.parallel  # Output the results of parallel analysis
youth.nl.parallel$nfact  # Suggested number of factors

# Create data frame from observed eigenvalue data for plotting
youth.nl.obs <- data.frame(youth.nl.parallel$fa.values)
youth.nl.obs$type <- c('Observed Data')  # Label the data as observed
youth.nl.obs$num <- c(row.names(youth.nl.obs))  # Add factor number as column
youth.nl.obs$num <- as.numeric(youth.nl.obs$num)  # Convert factor number to numeric
colnames(youth.nl.obs) <- c('eigenvalue', 'type', 'num')  # Set column names

# Calculate quantiles for eigenvalues from simulated data, storing the 95th percentile values
percentile <- apply(youth.nl.parallel$values,2,function(x) quantile(x,.95))
youth.nl.min <- as.numeric(nrow(youth.nl.obs))  # Minimum value for selecting the range
youth.nl.min <- (4*youth.nl.min) - (youth.nl.min-1)  # Adjust minimum index
youth.nl.max <- as.numeric(nrow(youth.nl.obs))  # Maximum value for selecting the range
youth.nl.max <- 4*youth.nl.max  # Adjust maximum index
youth.nl.percentile1 <- percentile[youth.nl.min:youth.nl.max]  # Select the 95th percentile eigenvalues

# Create a data frame for the simulated eigenvalue data
youth.nl.sim <- data.frame(youth.nl.percentile1)
youth.nl.sim$type <- c('Simulated Data (95th %ile)')  # Label the data as simulated
youth.nl.sim$num <- c(row.names(youth.nl.obs))  # Add factor number as column
youth.nl.sim$num <- as.numeric(youth.nl.sim$num)  # Convert factor number to numeric
colnames(youth.nl.sim) <- c('eigenvalue', 'type', 'num')  # Set column names

# Merge the observed and simulated eigenvalue data frames
youth.nl.eigendat <- rbind(youth.nl.obs,youth.nl.sim)

# Define a custom theme for the plot
apatheme = theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_blank(),
        text = element_text(family = 'Arial'),
        legend.title = element_blank(),
        legend.position = c(.7, .8),
        axis.line.x = element_line(color = 'black'),
        axis.line.y = element_line(color = 'black'))

# Save the scree plot as a png file
png("figs/youth_nl_scree_g.png", units="in", width=8, height=6, res=1200)

# Create the scree plot using ggplot2
youth.nl.scree <- ggplot(youth.nl.eigendat, aes(x = num, y = eigenvalue, shape = type)) +
  geom_line() +  # Add lines connecting data points
  geom_point(size = 2.5) +  # Add points for each eigenvalue
  scale_y_continuous(name = 'Eigenvalue', limits = c(-.6, 5), breaks = seq(0, 5, 1)) +  # Y-axis labels and limits
  scale_x_continuous(name = 'Factor Number', breaks = min(youth.nl.eigendat$num):max(youth.nl.eigendat$num)) +  # X-axis labels and limits
  scale_shape_manual(values = c(16, 1)) +  # Define different shapes for observed and simulated data
  geom_vline(xintercept = youth.nl.parallel$nfact, linetype = 'dashed') +  # Add vertical line for suggested number of factors
  theme_bw() +  # Apply custom theme
  theme(axis.title.x = element_text(size = 12, margin = margin(t = 18, r = 0, b = 0, l = 0)),
        axis.title.y = element_text(size = 12, margin = margin(t = 0, r = 18, b = 0, l = 0)),
        axis.text.x = element_text(size = 12, color = "black", margin = margin(t = 8, r = 0, b = 0, l = 0)),
        axis.text.y = element_text(size = 12, color = "black", margin = margin(t = 0, r = 8, b = 0, l = 0)),
        legend.position = c(.82, .92),
        legend.title = element_blank(),
        legend.text = element_text(size = 10),
        legend.background = element_blank(),
        legend.key = element_rect(colour = NA, fill = NA),
        axis.ticks.length = unit(.18, "cm"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

# Call the plot to render it
youth.nl.scree

# Close the file device to save the plot
dev.off()

# Very Simple Structure (VSS) criterion: assesses the simplicity of the factor structure
youth.nl.vss <- VSS(youth.nl.tetrachoric.matrix)   

# Perform VSS with oblimin rotation
youth.nl.vss.r <- VSS(youth.nl.tetrachoric.matrix, rotate = "oblimin")
youth.nl.vss.r

# Create a scree plot for VSS
VSS.scree(youth.nl.tetrachoric.matrix, main = "scree plot")
#VSS.scree(cor(stemtest.youth.nl.df, use = 'pairwise.complete.obs'), main = "scree plot")

# Save the VSS plot as a png file
png("figs/youth_nl_vss_g.png", units = "in", width = 8, height = 6, res = 300)

# Run and display the results of VSS
youth.nl.vss <- VSS(youth.nl.tetrachoric.matrix)   
youth.nl.vss

# Close the file device for VSS plot
dev.off()

# Velicer's minimum average partial (MAP) test: helps determine the number of factors to retain by minimizing residual variance
MAP(stemtest.youth.nl.df, corkind = 'polychoric', verbose = TRUE)

# Testing different factor models with up to 4 factors
#youth.nl.nf <- nfactors(stemtest.youth.nl.df, n = 4, fm = "ml", cor = "tet")
#youth.nl.nf 

# Extract and rotate factors (4 factors in this case)
youth.nl.Nfacs <- 4

# Perform factor analysis using oblimin/varimax rotation
#cor_fa <- cor(stemtest.youth.nl.df, use = 'pairwise.complete.obs')
youth.nl.fit.obl <- factanal(covmat = youth.nl.tetrachoric.matrix, factors = youth.nl.Nfacs, rotation = "oblimin") # oblimin rotation 
youth.nl.fit.var <- factanal(covmat = youth.nl.tetrachoric.matrix, factors = youth.nl.Nfacs, rotation = "varimax") # varimax rotation 

# Print the factor analysis results, showing loadings greater than 0.3
print(youth.nl.fit.obl, digits = 5, cutoff = 0.3, sort = TRUE)

# Extract the factor loadings
youth.nl.efa.loads <- youth.nl.fit.obl$loadings

# Perform factor analysis using oblimin/varimax rotation again using fa
youth.nl.fit.obl.fa <- psych::fa(youth.nl.tetrachoric.matrix, nfactors = youth.nl.Nfacs, rotate = "oblimin") # oblimin rotation 

# Extract the factor loadings
youth.nl.efa.loads.fa <- youth.nl.fit.obl.fa$loadings

# Calculate the proportion of variance explained by each factor
# Sum of squared loadings for each factor (this represents the explained variance)
youth.nl.efa.ss <- apply(youth.nl.efa.loads.fa^2, 2, sum)

# The total variance is 1 for standardized data, so the proportion explained by each factor
youth.nl.efa.var <- youth.nl.efa.ss/length(youth.nl.fit.obl.fa$communality)

# Create a dataframe with the proportion variance for each factor
youth.nl.efa.var.df <- data.frame(Factor = paste0("F", 1:youth.nl.Nfacs), 
                                    Proportion_Variance = youth.nl.efa.var)

# Write 'dataframe' dataframe to Excel
write_xlsx(youth.nl.efa.var.df, "files/youth_nl_efa_var_df.xlsx")

# Extract the inter-factor correlation matrix (Phi)
youth.nl.efa.inter <- youth.nl.fit.obl.fa$Phi

# Create a dataframe from the inter-factor correlation matrix
youth.nl.efa.inter.df <- data.frame(youth.nl.efa.inter)

# Move row names (which represent factors) into a new column called 'Factor'
youth.nl.efa.inter.df$Factor <- rownames(youth.nl.efa.inter.df)

# Replace "MR" with "F" in the Factor column (e.g., MR1 → F1)
youth.nl.efa.inter.df$Factor <- gsub("MR", "F", youth.nl.efa.inter.df$Factor)

# Also clean column names by removing ".MR" to keep only factor names (e.g., F.MR1 → F1)
colnames(youth.nl.efa.inter.df) <- gsub("\\.MR", "", colnames(youth.nl.efa.inter.df))

# Reshape the dataframe from wide to long format for pairwise correlations
youth.nl.efa.inter.df.long <- youth.nl.efa.inter.df %>%
  # Pivot all columns except 'Factor' into long format
  pivot_longer(cols = -Factor, names_to = "Other_Factor", values_to = "Correlation") %>%
  # Remove rows where a factor is correlated with itself (diagonal)
  filter(Factor != Other_Factor) %>%
  # Create a unique identifier for each pair (sorted alphabetically to avoid duplicates)
  mutate(pair = pmap_chr(list(Factor, Other_Factor), ~ paste(sort(c(..1, ..2)), collapse = "_"))) %>%
  # Keep only one row per unique factor pair
  distinct(pair, .keep_all = TRUE) %>%
  # Rename columns for clarity
  select(Factor1 = Factor, Factor2 = Other_Factor, Correlation)

# Ensure 'Factor2' labels are also cleaned (if still in MR format, though this may be redundant)
youth.nl.efa.inter.df.long$Factor2 <- gsub("MR", "F", youth.nl.efa.inter.df.long$Factor2)

# Remove rows where Factor1 and Factor2 are the same (i.e., self-correlations)
youth.nl.efa.inter.df.long <- youth.nl.efa.inter.df.long %>%
  filter(Factor1 != Factor2)

# Write 'youth.nl.efa.inter.df.long' dataframe to Excel
write_xlsx(youth.nl.efa.inter.df.long, "files/youth_nl_efa_inter_correlations.xlsx")

# Create a diagram of the factor loadings
fa.diagram(youth.nl.efa.loads)

# Transform factor loadings into a dataframe for easier analysis and manipulation
youth.nl.efa.loads.df <- as.data.frame.matrix(youth.nl.efa.loads)

# Transform row names (factor names) into a column
youth.nl.efa.loads.df <- rownames_to_column(youth.nl.efa.loads.df, var = "statements")

# Set values with absolute loadings <= 0.3 to NA for clarity
youth.nl.efa.loads.df[sapply(youth.nl.efa.loads.df, is.numeric)] <- lapply(youth.nl.efa.loads.df[sapply(youth.nl.efa.loads.df, is.numeric)], function(x) {
  x[abs(x) <= 0.3] <- NA
  return(x)
})

# Merge the factor loading data frame with another data frame containing labels for the statements
# This merges 'youth.nl.efa.loads.df' with 'labels.df' on the 'statements' column in 'youth.nl.efa.loads.df'
# and 'code' column in 'labels.df' to associate labels with factor loadings.
youth.nl.efa.loads.df <- merge(youth.nl.efa.loads.df, labels.df, by.x = "statements", by.y = "code")

# Select and organize necessary columns for further analysis and visualization
# Here we keep only the 'label' column and the four factors (Factor1 to Factor4) from the merged dataframe.
youth.nl.efa.loads.df <- youth.nl.efa.loads.df %>%
  select(label, Factor1, Factor2, Factor3, Factor4)

# Create a LaTeX table of the factor loadings for reporting purposes
# This converts 'youth.nl.efa.loads.df' into a LaTeX-compatible table with a caption and label for referencing in a report.
youth.nl.efa.loads.table <- xtable(youth.nl.efa.loads.df, caption = "Factor Loadings", label = "tab1")

# Save the LaTeX table to a .tex file
sink("files/youth_nl_loads_table.tex")  # This redirects output to the file
print(youth.nl.efa.loads.table, type = "latex", comment = FALSE, include.rownames = FALSE)  # Prints the table in LaTeX format
sink()  # Turns off the sink function, so the output returns to the console

# Add a new column 'ids' that contains the row names of the data frame (useful for identifying rows)
youth.nl.efa.loads.df$ids <- rownames(youth.nl.efa.loads.df)

# Select relevant columns, including 'ids', 'label', and the factor columns, into a new data frame
youth.nl.factor.loads <- youth.nl.efa.loads.df[, c("ids", "label", "Factor1", "Factor2", "Factor3", "Factor4")]

# Melt the data from wide format to long format, which is necessary for ggplot to visualize the data
# This converts the factor loading data from multiple columns into a single 'variable' column and a 'value' column.
youth.nl.efa.loads.df <- reshape2::melt(youth.nl.factor.loads)

# Replace the string "Factor" with "F" in the 'variable' column for cleaner labeling in the plot
youth.nl.efa.loads.df$variable <- gsub("Factor", "F", youth.nl.efa.loads.df$variable)

youth.nl.efa.loads.df$ids <- as.factor(youth.nl.efa.loads.df$ids)
youth.nl.efa.loads.df$label <- as.factor(youth.nl.efa.loads.df$label)

# Merge youth.nl.efa.loads.df with labels.df
youth.nl.efa.loads.df <- merge(youth.nl.efa.loads.df, labels.df, by = "label")

# Add ideological leaning to statement labels
youth.nl.efa.loads.df <- youth.nl.efa.loads.df %>%
  mutate(
    new_label = paste0(ideology, " ", new_label)  
    )

# Create a png file to save the heatmap

# The resolution is set to 1200 dpi for high-quality output, and the image size is specified in inches (6x6).
png("figs/youth_nl_efa_loadings.png", units="in", width=7, height=6, res=1200)

# Sort the levels of the 'new_label' based on the substring starting from the 5th character
# Ensure there are no duplicate levels by using unique() before assigning the levels
youth.nl.efa.loads.df$new_label <- factor(youth.nl.efa.loads.df$new_label, 
                                    levels = unique(youth.nl.efa.loads.df$new_label[order(substr(youth.nl.efa.loads.df$new_label, 5, nchar(youth.nl.efa.loads.df$new_label)))]))

# Create the heatmap using ggplot2
# The 'x' axis represents the factors, 'y' represents the ids (statements),
# and 'fill' represents the factor loading values.
ggplot(youth.nl.efa.loads.df, aes(x = variable, y = new_label, fill = value)) + 
  geom_tile() +  # Create a tile-based heatmap
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size=2.5) + 
  scale_fill_gradient2(midpoint = 0, low = "#FFE338", high = "#FF0000", mid = "white", 
                       na.value = "lightgrey") +  # Color scale from yellow to red, with NA values in light grey
  labs(fill = "Loading") +  # Label for the color scale legend
  theme_minimal() +  # Apply a minimal theme for the plot
  scale_y_discrete(labels = function(y) {
    # Add color formatting
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  theme(axis.text.y = element_markdown(),
        axis.text.x = element_text(hjust = .5),  # Center the x-axis labels
        axis.title = element_blank(),  # Remove axis titles
        legend.title = element_text(size = 9, margin = margin(r = 0.3, t = -1.8, unit = "lines")),  # Customize the legend title text size and margins
        legend.position = "bottom",  # Place the legend at the bottom
        legend.direction = "horizontal",  # Set the legend items to be horizontal
        legend.key.height = unit(0.4, "cm"), # Reduce the height of the legend scale
        legend.key.width = unit(.8, "cm")) # Increase the width of the legend scale

dev.off()  # Close the png file and save the plot

# Factor analysis diagram

png("figs/youth_nl_fa_diagram_g.png", units="in", width=15, height=12, res=1200)

youth.nl.fa.result <- psych::fa(youth.nl.tetrachoric.corr$rho, nfactors = 4, rotate = "oblimin")
colnames(youth.nl.fa.result$loadings) <- c("Factor 1", "Factor 2", "Factor 3", "Factor 4")
fa.diagram(youth.nl.fa.result, simple = FALSE, cut = 0, cex = 1, l.cex = 1, digits = 2)

dev.off()

################################################################################
## ITEM RESPONSE THEORY
################################################################################

# Remove any rows with missing data from the dataset
# This ensures that the model estimation doesn't fail due to NA values
stemtest.youth.nl.df.complete <- stemtest.youth.nl.df[complete.cases(stemtest.youth.nl.df), ]

# Set seed
set.seed(123)

# Randomly sample 10,000 complete cases from the cleaned dataset
# This reduces the data size for quicker model estimation
stemtest.youth.nl.df.sample <- stemtest.youth.nl.df.complete[
  sample(nrow(stemtest.youth.nl.df.complete), 
         size = min(10000, nrow(stemtest.youth.nl.df.complete))), 
]

# Fit a unidimensional 2-Parameter Logistic (2PL) IRT model to the sample
# Assumes there is one underlying latent trait (dimension)
youth.nl.2PL.1D <- mirt(stemtest.youth.nl.df.sample, model = 1, itemtype = '2PL')

summary(youth.nl.2PL.1D)

# Fit a bidimensional 2PL IRT model to the same sample
# Assumes two latent traits or factors influence item responses
youth.nl.2PL.2D <- mirt(stemtest.youth.nl.df.sample, model = 2, itemtype = '2PL')

summary(youth.nl.2PL.2D)

# Fit a three-dimensional 2PL IRT model to the same sample
# Assumes two latent traits or factors influence item responses
youth.nl.2PL.3D <- mirt(stemtest.youth.nl.df.sample, model = 3, itemtype = '2PL')

summary(youth.nl.2PL.3D)

# Fit a four-dimensional 2PL IRT model to the same sample
# Assumes two latent traits or factors influence item responses
youth.nl.2PL.4D <- mirt(stemtest.youth.nl.df.sample, model = 4, itemtype = '2PL')

summary(youth.nl.2PL.4D)

# Compare the two models (1D vs. 2D) using a likelihood ratio test
# Helps determine whether the added complexity of a 2D model significantly improves fit
anova(youth.nl.2PL.1D, youth.nl.2PL.2D)

# Compare the two models (2D vs. 3D) using a likelihood ratio test
# Helps determine whether the added complexity of a 3D model significantly improves fit
anova(youth.nl.2PL.2D, youth.nl.2PL.3D)

# Compare the two models (3D vs. 4D) using a likelihood ratio test
# Helps determine whether the added complexity of a 4D model significantly improves fit
anova(youth.nl.2PL.3D, youth.nl.2PL.4D)

# Store IRT model summary in an object
youth.nl.2PL.4D.sum <- summary(youth.nl.2PL.4D)

# Extract the factor correlation matrix
youth.nl.2PL.4D.corr <- youth.nl.2PL.4D.sum$fcor

# Convert to dataframe
youth.nl.2PL.4D.corr.df <- as.data.frame(youth.nl.2PL.4D.corr)

# Convert to matrix if not already
youth.nl.2PL.4D.corr.mat <- as.matrix(youth.nl.2PL.4D.corr.df)

# Get the lower triangle indices (excluding diagonal)
youth.nl.2PL.4D.low.tri <- which(lower.tri(youth.nl.2PL.4D.corr.mat), arr.ind = TRUE)

# Create the long-format dataframe
youth.nl.2PL.4D.cor.pairs.df <- data.frame(
  vaa = "Youth – Dutch",
  Factor1 = rownames(youth.nl.2PL.4D.corr.mat)[youth.nl.2PL.4D.low.tri[, 1]],
  Factor2 = colnames(youth.nl.2PL.4D.corr.mat)[youth.nl.2PL.4D.low.tri[, 2]],
  Correlation = youth.nl.2PL.4D.corr.mat[youth.nl.2PL.4D.low.tri]
)

# Save to Excel file
write_xlsx(youth.nl.2PL.4D.cor.pairs.df, "files/youth_nl_irt_inter_correlations.xlsx")

# Extract the factor loadings
youth.nl.irt.loads <- summary(youth.nl.2PL.4D)$rotF

# Calculate the proportion of variance explained by each factor
# Sum of squared loadings for each factor (this represents the explained variance)
youth.nl.irt.ss <- apply(youth.nl.irt.loads^2, 2, sum)

# The total variance is 1 for standardized data, so the proportion explained by each factor
youth.nl.irt.var <- youth.nl.irt.ss/nrow(youth.nl.irt.loads)

# Create a dataframe with the proportion variance for each factor
youth.nl.irt.var.df <- data.frame(Factor = paste0("F", 1:4), 
                                     Proportion_Variance = youth.nl.irt.var)

# Write 'dataframe' dataframe to Excel
write_xlsx(youth.nl.irt.var.df, "files/youth_nl_irt_var_df.xlsx")

# Extract item parameters from the fitted multidimensional 2PL model
# Set IRTpars = TRUE to return discrimination (a) and difficulty (b) parameters
youth.nl.item.params <- coef(youth.nl.2PL.4D, IRTpars = TRUE, simplify = TRUE)

# Combine item-level parameter lists into a single data frame
# Remove the last element if it's the $Group (typically contains test-level info)
youth.nl.item.params.df <- do.call(rbind, youth.nl.item.params[-length(youth.nl.item.params)]) 
youth.nl.item.params.df <- as.data.frame(youth.nl.item.params.df)

# Add item names as a column (they're currently row names)
youth.nl.item.params.df$Item <- rownames(youth.nl.item.params.df)

# Step 1: Compute total discrimination (α)
# This is the euclidean norm of the a-vector across the four latent dimensions
# α = sqrt(a1^2 + a2^2 + a3^2 + a3^4), where a1–a4 are the item discriminations on each dimension
youth.nl.item.params.df$discrimination <- sqrt(youth.nl.item.params.df$a1^2 + youth.nl.item.params.df$a2^2 + youth.nl.item.params.df$a4^2 + youth.nl.item.params.df$a3^2)

# Step 2: Compute difficulty (b) for each item
# In multidimensional 2PL, difficulty is derived from the intercept (d) as:
# b = -d / α
# This gives a sense of how "hard" it is to agree with the item (i.e., threshold location)
youth.nl.item.params.df$b_calculated <- -youth.nl.item.params.df$d / youth.nl.item.params.df$discrimination

# Step 3: Clean the data
# Exclude any items where discrimination is missing or zero
# (This avoids divide-by-zero issues or meaningless points in the plot)
youth.nl.item.params.df.clean <- youth.nl.item.params.df[!is.na(youth.nl.item.params.df$discrimination) & youth.nl.item.params.df$discrimination > 0, ]

# Step 4: Plot item difficulty vs. discrimination
# Each point is an item; labels show the item name
# Higher discrimination = more informative item
# Higher difficulty = item requires higher latent trait level to agree with

png("figs/youth_nl_mirt_parameters.png", units = "in", width = 8, height = 6, res = 300)

ggplot(youth.nl.item.params.df.clean, aes(x = b_calculated, y = discrimination, label = rownames(youth.nl.item.params.df.clean))) +
  geom_point(color = "#0072B2", size = 3) +
  geom_text(vjust = -0.6, size = 4) +
  theme_minimal() +
  labs(
    # Removed the title
    x = "Difficulty (b)",
    y = "Discrimination (α)"
    #caption = "Computed from multidimensional 2PL IRT model"
  ) +
  theme(
    axis.title.x = element_text(size = 16, margin = margin(t = 12)),  # Top margin for x-axis title
    axis.title.y = element_text(size = 16, margin = margin(r = 12)),  # Right margin for y-axis title
    axis.text = element_text(size = 14),          # Larger axis tick labels
    plot.caption = element_text(size = 12),       # Optional: caption size
    panel.grid.minor = element_blank()
  )

dev.off()

# Extract factor loadings (discrimination parameters) from the 3-dimensional MIRT model
youth.nl.mirt.loads <- extract.mirt(youth.nl.2PL.4D, 'F')

# Convert the factor loadings matrix to a data frame for easier manipulation
youth.nl.mirt.loads.df <- as.data.frame(youth.nl.mirt.loads)

# Add item codes (original row names) as a new column called 'code'
youth.nl.mirt.loads.df$code <- rownames(youth.nl.mirt.loads.df)

# Reshape the data frame from wide to long format for plotting or analysis
# Each row now represents a loading for a specific item and dimension
youth.nl.mirt.loads.df <- reshape2::melt(youth.nl.mirt.loads.df)

# Set loadings with an absolute value ≤ 0.3 to NA
# This removes weak loadings for clarity in visualizations or further analysis
youth.nl.mirt.loads.df[sapply(youth.nl.mirt.loads.df, is.numeric)] <- lapply(
  youth.nl.mirt.loads.df[sapply(youth.nl.mirt.loads.df, is.numeric)],
  function(x) {
    x[abs(x) <= 0.3] <- NA
    return(x)
  }
)

# Merge the long-format factor loading data with an external label dataframe
# 'labels.df' must contain a 'code' column to match on
# This step adds descriptive item labels or metadata to the loadings
youth.nl.mirt.loads.df <- merge(youth.nl.mirt.loads.df, labels.df, by = "code")

# Add ideological leaning to statement labels
youth.nl.mirt.loads.df <- youth.nl.mirt.loads.df %>%
  mutate(
    new_label = paste0(ideology, " ", new_label)  
  )

# Create a png file to save the heatmap

# The resolution is set to 1200 dpi for high-quality output, and the image size is specified in inches (6x6).
png("figs/youth_nl_mirt_loadings.png", units="in", width=7, height=6, res=1200)

# Sort the levels of the 'new_label' based on the substring starting from the 5th character
# Ensure there are no duplicate levels by using unique() before assigning the levels
youth.nl.mirt.loads.df$new_label <- factor(youth.nl.mirt.loads.df$new_label, 
                                            levels = unique(youth.nl.mirt.loads.df$new_label[order(substr(youth.nl.mirt.loads.df$new_label, 5, nchar(youth.nl.mirt.loads.df$new_label)))]))

# Create the heatmap using ggplot2
# The 'x' axis represents the factors, 'y' represents the ids (statements),
# and 'fill' represents the factor loading values.
ggplot(youth.nl.mirt.loads.df, aes(x = variable, y = new_label, fill = value)) + 
  geom_tile() +  # Create a tile-based heatmap
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size=2.5) + 
  scale_fill_gradient2(midpoint = 0, low = "#FFE338", high = "#FF0000", mid = "white", 
                       na.value = "lightgrey") +  # Color scale from yellow to red, with NA values in light grey
  labs(fill = "Loading") +  # Label for the color scale legend
  theme_minimal() +  # Apply a minimal theme for the plot
  scale_y_discrete(labels = function(y) {
    # Add color formatting
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  theme(axis.text.y = element_markdown(),
        axis.text.x = element_text(hjust = .5),  # Center the x-axis labels
        axis.title = element_blank(),  # Remove axis titles
        legend.title = element_text(size = 9, margin = margin(r = 0.3, t = -1.8, unit = "lines")),  # Customize the legend title text size and margins
        legend.position = "bottom",  # Place the legend at the bottom
        legend.direction = "horizontal",  # Set the legend items to be horizontal
        legend.key.height = unit(0.4, "cm"), # Reduce the height of the legend scale
        legend.key.width = unit(.8, "cm")) # Increase the width of the legend scale

dev.off()  # Close the png file and save the plot

# Select specific columns (code, variable, value, and new_label) from the 'youth.nl.efa.loads.df' dataframe
youth.nl.efa.loads.df <- youth.nl.efa.loads.df %>%
  dplyr::select(code, variable, value, new_label)

# Select specific columns (code, variable, value, and new_label) from the 'youth.nl.mirt.loads.df' dataframe
youth.nl.mirt.loads.df <- youth.nl.mirt.loads.df %>%
  dplyr::select(code, variable, value, new_label)

# Add a new column 'method' to the 'youth.nl.efa.loads.df' dataframe and assign the value "(a) Exploratory Factor Analysis" to it
youth.nl.efa.loads.df$method <- "(a) Exploratory Factor Analysis"

# Add a new column 'method' to the 'youth.nl.mirt.loads.df' dataframe and assign the value "(b) Item Response Theory" to it
youth.nl.mirt.loads.df$method <- "(b) Item Response Theory"

# Combine the two dataframes ('youth.nl.efa.loads.df' and 'youth.nl.mirt.loads.df') into one dataframe ('youth.nl.efa.loads.merged.df') by row-binding them
youth.nl.efa.loads.merged.df <- rbind(youth.nl.efa.loads.df, youth.nl.mirt.loads.df)

# Create a png file to save the heatmap

# The resolution is set to 1200 dpi for high-quality output, and the image size is specified in inches (6x6).
png("figs/youth_nl_loadings_combined.png", units="in", width=12, height=7, res=1200)

# Create the heatmap using ggplot2
# The 'x' axis represents the factors, 'y' represents the ids (statements),
# and 'fill' represents the factor loading values.
ggplot(youth.nl.efa.loads.merged.df, aes(x = variable, y = new_label, fill = value)) + 
  geom_tile() +  # Create a tile-based heatmap
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size=3.5) + 
  scale_fill_gradient2(midpoint = 0, low = "#FFE338", high = "#FF0000", mid = "white", 
                       na.value = "lightgrey") +  # Color scale from yellow to red, with NA values in light grey
  labs(fill = "Loading") +  # Label for the color scale legend
  theme_minimal() +  # Apply a minimal theme for the plot
  facet_wrap(~ method) +
  scale_y_discrete(labels = function(y) {
    # Add color formatting
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  theme(axis.text.y = element_markdown(size=12),
        axis.text.x = element_text(size=12, hjust = .5),  # Center the x-axis labels
        axis.title = element_blank(),  # Remove axis titles
        legend.title = element_text(size = 12, margin = margin(r = 0.3, t = -1.8, unit = "lines")),  # Customize the legend title text size and margins
        legend.text = element_text(size = 12),
        legend.position = "bottom",  # Place the legend at the bottom
        legend.direction = "horizontal",  # Set the legend items to be horizontal
        legend.key.height = unit(0.4, "cm"), # Reduce the height of the legend scale
        legend.key.width = unit(.8, "cm"), # Increase the width of the legend scale
        strip.text = element_text(size = 14))  # Adjust the facet title size here

dev.off()  # Close the png file and save the plot

################################################################################
## CHECK IRT MODELS IMPROVEMENTS
################################################################################

set.seed(1234)  # For reproducibility

# -------------------------
# 1. Split data
# -------------------------

n <- nrow(stemtest.youth.nl.df.sample)
youth.nl.train.index <- sample(seq_len(n), size = floor(0.8 * n))

youth.nl.train.data <- stemtest.youth.nl.df.sample[youth.nl.train.index, ]
youth.nl.test.data  <- stemtest.youth.nl.df.sample[-youth.nl.train.index, ]

# -------------------------
# 2. Prepare test data
# -------------------------

youth.nl.test.items <- youth.nl.test.data[, !names(youth.nl.test.data) %in% "person_weights"]
youth.nl.test.weights <- youth.nl.test.data$person_weights

# -------------------------
# 3. Fit 2PL models on test data
# -------------------------

youth.nl.2PL.1D.test <- mirt(youth.nl.test.items, model = 1, itemtype = "2PL",
                             weights = youth.nl.test.weights, verbose = FALSE)

youth.nl.2PL.2D.test <- mirt(youth.nl.test.items, model = 2, itemtype = "2PL",
                             weights = youth.nl.test.weights, verbose = FALSE)

youth.nl.2PL.3D.test <- mirt(youth.nl.test.items, model = 3, itemtype = "2PL",
                             weights = youth.nl.test.weights, verbose = FALSE)

youth.nl.2PL.4D.test <- mirt(youth.nl.test.items, model = 4, itemtype = "2PL",
                             weights = youth.nl.test.weights, verbose = FALSE)

# -------------------------
# 4. Define safe extraction functions
# -------------------------

# Safe log-likelihood extraction
safe_logLik <- function(model) {
  out <- tryCatch(logLik(model), error = function(e) NA_real_)
  if (length(out) == 0) return(NA_real_) else return(as.numeric(out))
}

# Extract BIC directly from the internal slot
safe_BIC <- function(model) {
  out <- tryCatch(model@Fit$BIC, error = function(e) NA_real_)
  if (length(out) == 0) return(NA_real_) else return(as.numeric(out))
}

# -------------------------
# 5. Extract logLik & BIC for each model
# -------------------------

youth.nl.logliks <- c(
  safe_logLik(youth.nl.2PL.1D.test),
  safe_logLik(youth.nl.2PL.2D.test),
  safe_logLik(youth.nl.2PL.3D.test),
  safe_logLik(youth.nl.2PL.4D.test)
)

youth.nl.bics <- c(
  safe_BIC(youth.nl.2PL.1D.test),
  safe_BIC(youth.nl.2PL.2D.test),
  safe_BIC(youth.nl.2PL.3D.test),
  safe_BIC(youth.nl.2PL.4D.test)
)

# -------------------------
# 6. Create data.frame with model fit stats
# -------------------------

youth.nl.irt.fit.stats <- data.frame(
  Model = c("2PL_1D", "2PL_2D", "2PL_3D", "2PL_4D"),
  Dimension = 1:4,
  LogLikelihood = youth.nl.logliks,
  BIC = youth.nl.bics
)

# -------------------------
# 7. Fit structureless baseline model (intercept-only GLMs)
# -------------------------

baseline_models <- lapply(youth.nl.test.items, function(item) {
  glm(item ~ 1, family = binomial(link = "logit"))
})

baseline_logLik <- sum(sapply(baseline_models, function(m) as.numeric(logLik(m))))
baseline_BIC <- sum(sapply(baseline_models, BIC))

# Add baseline row
youth.nl.irt.fit.stats <- rbind(
  data.frame(
    Model = "NoStructure_Baseline",
    Dimension = 0,
    LogLikelihood = baseline_logLik,
    BIC = baseline_BIC
  ),
  youth.nl.irt.fit.stats
)

# -------------------------
# 8. Compute improvements
# -------------------------

# Sort by number of dimensions
youth.nl.irt.fit.stats <- youth.nl.irt.fit.stats[order(youth.nl.irt.fit.stats$Dimension), ]

# Differences vs. null (absolute improvement)
youth.nl.irt.fit.stats$LogLik_vs_Null <- youth.nl.irt.fit.stats$LogLikelihood - baseline_logLik
youth.nl.irt.fit.stats$BIC_vs_Null <- youth.nl.irt.fit.stats$BIC - baseline_BIC

# Differences vs. previous model (incremental improvement)
youth.nl.irt.fit.stats$LogLik_vs_Prev <- c(NA, diff(youth.nl.irt.fit.stats$LogLikelihood))
youth.nl.irt.fit.stats$BIC_vs_Prev <- c(NA, diff(youth.nl.irt.fit.stats$BIC))

# -------------------------
# 9. Display results and diagnostics
# -------------------------

print(youth.nl.irt.fit.stats)
print(xtable(youth.nl.irt.fit.stats), type = "latex", include.rownames = FALSE)

# Check which models were extracted successfully
model_status <- data.frame(
  Model = youth.nl.irt.fit.stats$Model,
  LogLik_OK = !is.na(youth.nl.irt.fit.stats$LogLikelihood),
  BIC_OK = !is.na(youth.nl.irt.fit.stats$BIC)
)

print(model_status)